The packages used in this project are: Rio: Chan et al. (2021) Readr: Wickham and Hester (2021) Haven: Wickham and Miller (2021)

Passengers Data

Load data

dat <- import(here("data", "dat.csv")) %>% 
   clean_names() %>% 
   mutate_all(na_if,"")

Clean data

dat$gender <- as.factor(dat$gender)
dat$marital_status <- as.factor(dat$marital_status)
dat$category <- as.factor(dat$category)
dat$class <- as.factor(dat$class)
dat$status <- as.factor(dat$survived)
dat$embarked <- as.factor(dat$embarked)
dat$disembarked  <- as.factor(dat$disembarked)

dat <- dat %>% 
 mutate(nationality2 = case_when(nationality == "English" ~ "English",
   nationality == "Irish" ~ "Irish",
   nationality == "American" ~ "American",
   nationality == "Swedish" ~ "Swedish",
   nationality == "Finnish" ~ "Finnish",
   nationality == "Scottish" ~ "Scottish",
   nationality == "French" ~ "French",
   nationality == "Italian" ~ "Italian",
   nationality == "Canadian" ~ "Canadian",
   nationality == "Bulgarian" ~ "Bulgarian",
   nationality == "Croatian" ~ "Croatian",
   nationality == "Belgian" ~ "Belgian",
   nationality == "Norwegian" ~ "Norwegian",
   nationality == "Channel Islander" ~ "Channel Islander",
   nationality == "Welsh" ~ "Welsh",
   nationality == "Swiss" ~ "Swiss",
   nationality == "German" ~ "German",
   nationality == "Danish" ~ "Danish",
   nationality == "Spanish" ~ "Spanish",
   nationality == "Australian" ~ "Australian",
   nationality == "Polish" ~ "Polish",
   nationality == "South African" ~ "South African",
   nationality == "Bosnian" ~ "Bosnian",
   nationality == "Hong Kongese" ~ "Hong Kongese",
   nationality == "Dutch" ~ "Dutch",
   nationality == "Lithuanian" ~ "Lithuanian",
   nationality == "Greek" ~ "Greek",
   nationality == "Portuguese" ~ "Portuguese",
   nationality == "Uruguayan" ~ "Uruguayan",
   nationality == "Chinese" ~ "Chinese",
   nationality == "Slovenian" ~ "Slovenian",
   nationality == "Cape Verdean" ~ "Cape Verdean",
   nationality == "Egyptian" ~ "Egyptian",
   nationality == "Japanese" ~ "Japanese",
   nationality == "Hungarian" ~ "Hungarian",
   nationality == "Bosnian" ~ "Bosnian",
   nationality == "Hong Kongese" ~ "Hong Kongese",
   nationality == "Latvian" ~ "Latvian",
   nationality == "Austrian" ~ "Austrian",
   nationality == "Greek" ~ "Greek",
   nationality == "Mexican" ~ "Mexican",
   nationality == "Sweden" ~ "Sweedish",
   nationality == "Turkish" ~ "Turkish",
   nationality == "Slovenian" ~ "Slovenian",
   nationality == "Guyanese" ~ "Guyanese",
   nationality == "Haitian" ~ "Haitian",
   nationality == "Syrian,Lebanese" ~ "Syrian/Lebanese",
   nationality == "Unknown" ~ "Unknown",
   TRUE ~ "Other - Multiple", ))

dat <- dat %>% 
   mutate(nationality2 = ifelse(nationality2 == "Unknown", NA, nationality2))

Descriptives

# Breakdown of passengers by class and gender
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(gender)) %>% 
   group_by(class, gender) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>%
 kable(caption = "Breakdown of Passengers by Class and Gender",
       col.names = c("Class", "Gender", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
Breakdown of Passengers by Class and Gender
Class Gender Count Percent
1st Class Female 153 43.71
1st Class Male 197 56.29
2nd Class Female 112 38.36
2nd Class Male 180 61.64
3rd Class Female 216 30.47
3rd Class Male 493 69.53
# Breakdown of passenger nationalities
dat %>% 
   filter(!is.na(nationality2)) %>% 
   group_by(nationality2) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(desc(percent)) %>%
 kable(caption = "Breakdown of Passenger Nationalities",
       col.names = c("Nationality", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
Breakdown of Passenger Nationalities
Nationality Count Percent
English 1037 42.36
Irish 361 14.75
American 246 10.05
Other - Multiple 116 4.74
Swedish 99 4.04
Syrian/Lebanese 86 3.51
Finnish 58 2.37
Scottish 49 2.00
French 44 1.80
Italian 41 1.67
Canadian 39 1.59
Bulgarian 33 1.35
Croatian 28 1.14
Belgian 26 1.06
Norwegian 26 1.06
Channel Islander 25 1.02
Welsh 23 0.94
Swiss 22 0.90
German 14 0.57
Danish 11 0.45
Spanish 9 0.37
Australian 7 0.29
Polish 6 0.25
South African 5 0.20
Bosnian 4 0.16
Hong Kongese 4 0.16
Dutch 3 0.12
Greek 3 0.12
Lithuanian 3 0.12
Uruguayan 3 0.12
Chinese 2 0.08
Portuguese 2 0.08
Slovenian 2 0.08
Austrian 1 0.04
Cape Verdean 1 0.04
Egyptian 1 0.04
Guyanese 1 0.04
Haitian 1 0.04
Hungarian 1 0.04
Japanese 1 0.04
Latvian 1 0.04
Mexican 1 0.04
Sweedish 1 0.04
Turkish 1 0.04
# Breakdown of passenger nationalities by class (all)
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(nationality2)) %>% 
   group_by(class, nationality2) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(class, desc(percent)) %>%
 kable(caption = "Breakdown of Passenger Nationalities by Class (All)",
       col.names = c("Class", "Nationality", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
Breakdown of Passenger Nationalities by Class (All)
Class Nationality Count Percent
1st Class American 195 57.35
1st Class English 51 15.00
1st Class Canadian 27 7.94
1st Class Other - Multiple 14 4.12
1st Class French 10 2.94
1st Class Irish 6 1.76
1st Class Swiss 6 1.76
1st Class German 5 1.47
1st Class Scottish 5 1.47
1st Class Spanish 4 1.18
1st Class Swedish 4 1.18
1st Class Uruguayan 3 0.88
1st Class Belgian 2 0.59
1st Class Italian 2 0.59
1st Class Channel Islander 1 0.29
1st Class Dutch 1 0.29
1st Class Egyptian 1 0.29
1st Class Mexican 1 0.29
1st Class Norwegian 1 0.29
1st Class Polish 1 0.29
2nd Class English 145 51.06
2nd Class Other - Multiple 25 8.80
2nd Class American 24 8.45
2nd Class Channel Islander 12 4.23
2nd Class Irish 12 4.23
2nd Class French 11 3.87
2nd Class Scottish 8 2.82
2nd Class Finnish 6 2.11
2nd Class Swedish 6 2.11
2nd Class Canadian 5 1.76
2nd Class South African 4 1.41
2nd Class Spanish 4 1.41
2nd Class Danish 3 1.06
2nd Class Italian 3 1.06
2nd Class Lithuanian 2 0.70
2nd Class Swiss 2 0.70
2nd Class Syrian/Lebanese 2 0.70
2nd Class Welsh 2 0.70
2nd Class Australian 1 0.35
2nd Class Belgian 1 0.35
2nd Class German 1 0.35
2nd Class Haitian 1 0.35
2nd Class Hungarian 1 0.35
2nd Class Japanese 1 0.35
2nd Class Norwegian 1 0.35
2nd Class Portuguese 1 0.35
3rd Class English 112 15.80
3rd Class Irish 105 14.81
3rd Class Swedish 89 12.55
3rd Class Syrian/Lebanese 83 11.71
3rd Class Other - Multiple 69 9.73
3rd Class Finnish 52 7.33
3rd Class Bulgarian 33 4.65
3rd Class Croatian 28 3.95
3rd Class Norwegian 24 3.39
3rd Class American 23 3.24
3rd Class Belgian 22 3.10
3rd Class Danish 7 0.99
3rd Class Scottish 6 0.85
3rd Class Welsh 6 0.85
3rd Class Canadian 5 0.71
3rd Class French 5 0.71
3rd Class Polish 5 0.71
3rd Class Swiss 5 0.71
3rd Class Bosnian 4 0.56
3rd Class Hong Kongese 4 0.56
3rd Class Italian 4 0.56
3rd Class Greek 3 0.42
3rd Class Channel Islander 2 0.28
3rd Class Chinese 2 0.28
3rd Class German 2 0.28
3rd Class Slovenian 2 0.28
3rd Class Australian 1 0.14
3rd Class Austrian 1 0.14
3rd Class Latvian 1 0.14
3rd Class Lithuanian 1 0.14
3rd Class Portuguese 1 0.14
3rd Class Sweedish 1 0.14
3rd Class Turkish 1 0.14
# Breakdown of passenger nationalities by class (>= 5%)
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(nationality2)) %>% 
   group_by(class, nationality2) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   filter(percent >= 5) %>% 
   arrange(class, desc(percent)) %>%
 kable(caption = "Breakdown of Passenger Nationalities by Class (>= 5%)",
       col.names = c("Class", "Nationality", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
Breakdown of Passenger Nationalities by Class (>= 5%)
Class Nationality Count Percent
1st Class American 195 57.35
1st Class English 51 15.00
1st Class Canadian 27 7.94
2nd Class English 145 51.06
2nd Class Other - Multiple 25 8.80
2nd Class American 24 8.45
3rd Class English 112 15.80
3rd Class Irish 105 14.81
3rd Class Swedish 89 12.55
3rd Class Syrian/Lebanese 83 11.71
3rd Class Other - Multiple 69 9.73
3rd Class Finnish 52 7.33
# Average age by class
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   group_by(class) %>% 
   summarize(avg_age = mean(age), min_age = min(age), max_age = max(age)) %>%
 kable(caption = "Average Age by Class",
       col.names = c("Class", "Average Age", "Minimum Age", "Maximum Age"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
Average Age by Class
Class Average Age Minimum Age Maximum Age
1st Class 39.12 0 71
2nd Class 30.01 0 71
3rd Class 25.12 0 74
# Survival rate by class
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(survived)) %>% 
   group_by(class, survived) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(class, survived) %>%
 kable(caption = "Survival Rate by Class",
       col.names = c("Class", "Survived", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
Survival Rate by Class
Class Survived Count Percent
1st Class Lost 123 37.96
1st Class Saved 201 62.04
2nd Class Lost 166 58.45
2nd Class Saved 118 41.55
3rd Class Lost 528 74.47
3rd Class Saved 181 25.53
# Survival rate by gender
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(survived)) %>% 
   group_by(gender, survived) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(gender, survived) %>%
 kable(caption = "Survival Rate by Gender",
       col.names = c("Gender", "Survived", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
Survival Rate by Gender
Gender Survived Count Percent
Female Lost 127 27.25
Female Saved 339 72.75
Male Lost 690 81.08
Male Saved 161 18.92
# Survival rate by class and gender
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(survived)) %>% 
   group_by(class, gender, survived) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(class, gender) %>%
 kable(caption = "Survival Rate by Class and Gender",
       col.names = c("Class", "Gender", "Survived", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
## `summarise()` has grouped output by 'class', 'gender'. You can override using the `.groups` argument.
Survival Rate by Class and Gender
Class Gender Survived Count Percent
1st Class Female Lost 5 3.47
1st Class Female Saved 139 96.53
1st Class Male Lost 118 65.56
1st Class Male Saved 62 34.44
2nd Class Female Lost 12 11.32
2nd Class Female Saved 94 88.68
2nd Class Male Lost 154 86.52
2nd Class Male Saved 24 13.48
3rd Class Female Lost 110 50.93
3rd Class Female Saved 106 49.07
3rd Class Male Lost 418 84.79
3rd Class Male Saved 75 15.21

Density ridges

surv_classhist <- dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   filter(!is.na(survived)) %>% 
   ggplot(aes(age, class)) +
   geom_density_ridges(aes(fill = factor(survived))) +
   labs(title = "Age Distribution of Survival Status By Class", 
   x = "Age Distribution", y = "Passenger Class") +
   theme_minimal() +
   theme(plot.title = element_text(hjust = 0.5))

surv_classhist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
## Picking joint bandwidth of 3.69

surv_agehist <- dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   filter(!is.na(survived)) %>% 
   ggplot(aes(age, gender)) +
   geom_density_ridges(aes(fill = factor(survived))) +
   labs(title = "Age Distribution of Survival Status By Gender", 
   x = "Age Distribution", y = "Passenger Gender") +
   theme_minimal() +
   theme(plot.title = element_text(hjust = 0.5))

surv_agehist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
## Picking joint bandwidth of 3.88

surv_ageclass_hist <- dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   filter(!is.na(survived)) %>% 
   ggplot(aes(age, gender)) +
   facet_wrap(~class, nrow=3) +
   geom_density_ridges(aes(fill = factor(survived))) +
   labs(title = "Age Distribution of Survival Status By Class and Gender", 
   x = "Age Distribution", y = "Passenger Gender") +
   theme_minimal() +
   theme(plot.title = element_text(hjust = 0.5))

surv_ageclass_hist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
## Picking joint bandwidth of 6.54
## Picking joint bandwidth of 5.57
## Picking joint bandwidth of 2.96

Fares

Load data

fares <- import(here("data", "avgfare.csv")) %>% 
   clean_names()

Calculate inflation

p1921 <- (17.9/9.7)
fares$fare_1921 <- p1921*fares$fare_1912
fares$fare_1921 <- round(fares$fare_1921, 2)

p1931 <- (15.2/9.7)
fares$fare_1931 <- p1931*fares$fare_1912
fares$fare_1931 <- round(fares$fare_1931, 2)

p1941 <- (14.7/9.7)
fares$fare_1941 <- p1941*fares$fare_1912
fares$fare_1941 <- round(fares$fare_1941, 2)

p1951 <- (26.0/9.7)
fares$fare_1951 <- p1951*fares$fare_1912
fares$fare_1951 <- round(fares$fare_1951, 2)

p1961 <- (29.9/9.7)
fares$fare_1961 <- p1961*fares$fare_1912
fares$fare_1961 <- round(fares$fare_1961, 2)

p1971 <- (40.5/9.7)
fares$fare_1971 <- p1971*fares$fare_1912
fares$fare_1971 <- round(fares$fare_1971, 2)

p1981 <- (90.9/9.7)
fares$fare_1981 <- p1981*fares$fare_1912
fares$fare_1981 <- round(fares$fare_1981, 2)

p1991 <- (136.2/9.7)
fares$fare_1991 <- p1991*fares$fare_1912
fares$fare_1991 <- round(fares$fare_1991, 2)

p2001 <- (177.1/9.7)
fares$fare_2001 <- p2001*fares$fare_1912
fares$fare_2001 <- round(fares$fare_2001, 2)

p2011 <- (224.9/9.7)
fares$fare_2011 <- p2011*fares$fare_1912
fares$fare_2011 <- round(fares$fare_2011, 2)

p2021 <- (274.3/9.7)
fares$fare_2021 <- p2021*fares$fare_1912
fares$fare_2021 <- round(fares$fare_2021, 2)

Reshape fare data

faredat <- gather(fares, year, price, fare_1912:fare_2021)

faredat <- faredat %>% 
   mutate(year = case_when(year == "fare_1912" ~ 1912,
   year == "fare_1921" ~ 1921,
   year == "fare_1931" ~ 1931,
   year == "fare_1941" ~ 1941,
   year == "fare_1951" ~ 1951,
   year == "fare_1961" ~ 1961,
   year == "fare_1971" ~ 1971,
   year == "fare_1981" ~ 1981,
   year == "fare_1991" ~ 1991,
   year == "fare_2001" ~ 2001,
   year == "fare_2011" ~ 2011,
   year == "fare_2021" ~ 2021, ))

faredat$accommodation <- as.factor(faredat$accommodation)

faredat$accomodation <- factor(faredat$accommodation, levels = c("First-class parlor suite", "First-class cabin", "Second-class cabin", "Third-class cabin"))

Plot fare inflation

fare_graph <- faredat %>% 
   ggplot(aes(year, price, colour=accommodation)) +
   geom_line() +
   geom_point() +
   scale_colour_brewer(palette="Spectral") +
   facet_wrap(~ accommodation, 4, scales = "free") +
   xlim(1912,2021) +
   theme(panel.spacing = unit(1, "lines")) +
   theme_minimal()

fare_graph

ggplotly(fare_graph)

When taking inflation rates into consideration, we see that the average price for a 1st class cabin in 1912 was $150, which today would be $4241.74

References

Chan, Chung-hong, Geoffrey CH Chan, Thomas J. Leeper, and Jason Becker. 2021. Rio: A Swiss-Army Knife for Data File i/o.
Wickham, Hadley, and Jim Hester. 2021. Readr: Read Rectangular Text Data. https://CRAN.R-project.org/package=readr.
Wickham, Hadley, and Evan Miller. 2021. Haven: Import and Export ’SPSS’, ’Stata’ and ’SAS’ Files. https://CRAN.R-project.org/package=haven.